Lesson 4


Scatterplots and Perceived Audience Size

Notes:


Scatterplots

Notes:

library(ggplot2)
# Load the tab-separated pseudo-Facebook user data into `pf`.
pf <- read.csv(file = "pseudo_facebook.tsv", sep = "\t")
# Quick scatterplot of friend count against age.
qplot(x = age, y = friend_count, data = pf)


What are some things that you notice right away?

Response:


ggplot Syntax

Notes:

# Scatterplot of friend_count against age built from explicit ggplot layers.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point()


Overplotting

Notes:

# Jitter the points and draw them at 1/20 opacity to reduce overplotting;
# the x axis is restricted to ages 13 through 90.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 1 / 20) +
  xlim(13, 90)
## Warning: Removed 5199 rows containing missing values (geom_point).

What do you notice in the plot?

Response:


Coord_trans()

Notes:

# Semi-transparent points for ages 13 to 90, with the y axis drawn on a
# square-root scale.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 4906 rows containing missing values (geom_point).

Look up the documentation for coord_trans() and add a layer to the plot that transforms friend_count using the square root function. Create your plot!

# Exercise answer: coord_trans(y = "sqrt") square-root-transforms the
# friend_count axis of the age scatterplot.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 4906 rows containing missing values (geom_point).

What do you notice?


Alpha and Jitter

Notes:

# Friendships initiated against age: jittered points at 1/10 opacity,
# ages 13 through 90.
ggplot(data = pf, mapping = aes(x = age, y = friendships_initiated)) +
  geom_jitter(alpha = 1 / 10) +
  xlim(13, 90)
## Warning: Removed 5185 rows containing missing values (geom_point).


Overplotting and Domain Knowledge

Notes:


Conditional Means

Notes:

library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Group users by age, then compute the mean and median friend count and
# the number of users in each age group, ordered by age.
age_groups <- pf %>% group_by(age)
pf.fc_by_age <- age_groups %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age)
# Show the first rows of the per-age summary.
head(pf.fc_by_age)
## Source: local data frame [6 x 4]
## 
##   age friend_count_mean friend_count_median    n
## 1  13          164.7500                74.0  484
## 2  14          251.3901               132.0 1925
## 3  15          347.6921               161.0 2618
## 4  16          351.9371               171.5 3086
## 5  17          350.3006               156.0 3283
## 6  18          331.1663               162.0 5196
library(dplyr)

Create your plot!

library(ggplot2)
# Re-read the data set, then list the columns of the per-age summary.
pf <- read.csv(file = "pseudo_facebook.tsv", sep = "\t")
colnames(pf.fc_by_age)
## [1] "age"                 "friend_count_mean"   "friend_count_median"
## [4] "n"
# Line plot of the mean friend count for each age.
ggplot(
  data = pf.fc_by_age,
  mapping = aes(x = age, y = friend_count_mean)
) +
  geom_line()


Overlaying Summaries with Raw Data

Notes:

# Overlay conditional summaries of friend_count by age on the raw data:
# the mean (solid line) and the 10th, 50th and 90th percentiles (dashed
# blue lines), all on a square-root y scale.
ggplot(aes(x = age, y = friend_count), data = pf) +
  geom_point(
    alpha = 0.05,
    # Jitter horizontally only, so friend counts are not shifted (vertical
    # jitter could push them below zero, which the sqrt scale cannot show).
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  coord_trans(y = "sqrt") +
  geom_line(stat = "summary", fun.y = mean) +
  # Arguments for the summary function must be passed through `fun.args`.
  # A bare `probs = ...` is not a stat_summary parameter, so
  # ggplot2 >= 2.0.0 ignores it and quantile() no longer returns one value.
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.1),
    linetype = 2, color = "blue"
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.5),
    linetype = 2, color = "blue"
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.9),
    linetype = 2, color = "blue"
  )

What are some of your observations of the plot?

Response:


Moira: Histogram Summary and Scatterplot

See the Instructor Notes of this video to download Moira’s paper on perceived audience size and to see the final plot.

Notes:


Correlation

Notes:

# Pearson correlation test between friend count and age over all users.
cor.test(x = pf$friend_count, y = pf$age, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  pf$friend_count and pf$age
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737
# Same test, with the columns looked up inside `pf` via with().
with(pf, cor.test(x = friend_count, y = age, method = "pearson"))
## 
##  Pearson's product-moment correlation
## 
## data:  friend_count and age
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737

Look up the documentation for the cor.test function.

What’s the correlation between age and friend count? Round to three decimal places. Response:


Correlation on Subsets

Notes:

# Open the help page for subset().
help("subset")
# Pearson correlation between age and friend count, users aged 70 or under.
with(
  subset(pf, age <= 70),
  cor.test(x = age, y = friend_count, method = "pearson")
)
## 
##  Pearson's product-moment correlation
## 
## data:  age and friend_count
## t = -52.5923, df = 91029, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1780220 -0.1654129
## sample estimates:
##        cor 
## -0.1717245

Correlation Methods

Notes:


Create Scatterplots

Notes:

# Likes received overall against likes received through the website.
ggplot(
  data = pf,
  mapping = aes(x = www_likes_received, y = likes_received)
) +
  geom_point()


Strong Correlations

Notes:

# Same scatterplot with both axes limited to the range from 0 to the 95th
# percentile of each variable, plus a red linear-model fit line.
ggplot(
  data = pf,
  mapping = aes(x = www_likes_received, y = likes_received)
) +
  geom_point() +
  xlim(0, quantile(pf$www_likes_received, probs = 0.95)) +
  ylim(0, quantile(pf$likes_received, probs = 0.95)) +
  geom_smooth(method = "lm", color = "red")
## Warning: Removed 6075 rows containing missing values (stat_smooth).
## Warning: Removed 6075 rows containing missing values (geom_point).

What’s the correlation between the two variables? Include the top 5% of values for the variable in the calculation and round to 3 decimal places.

# Pearson correlation test on the full, untrimmed likes columns.
cor.test(
  x = pf$www_likes_received,
  y = pf$likes_received,
  method = "pearson"
)
## 
##  Pearson's product-moment correlation
## 
## data:  pf$www_likes_received and pf$likes_received
## t = 937.1035, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9473553 0.9486176
## sample estimates:
##       cor 
## 0.9479902

Response:


Moira on Correlation

Notes:


More Caution with Correlation

Notes:

#install.packages('alr3')
library(alr3)
## Loading required package: car
# Load the Mitchell data set and list its columns.
data("Mitchell")
colnames(Mitchell)
## [1] "Month" "Temp"

Create your plot!

# Scatterplot of temperature against month.
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point()


Noisy Scatterplots

  1. Take a guess for the correlation coefficient for the scatterplot.

  2. What is the actual correlation of the two variables? (Round to the thousandths place)

# Pearson correlation test between month and temperature.
cor.test(x = Mitchell$Month, y = Mitchell$Temp, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  Mitchell$Month and Mitchell$Temp
## t = 0.8182, df = 202, p-value = 0.4142
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.08053637  0.19331562
## sample estimates:
##        cor 
## 0.05747063

Making Sense of Data

Notes:

# Fold the month index onto a 12-month cycle (Month modulo 12) and plot
# temperature against it.
ggplot(data = Mitchell, mapping = aes(x = (Month %% 12), y = Temp)) +
  geom_point()

# Temperature against month with an x-axis break every 12 months.
# Month is numeric, so the breaks must be set on a continuous scale; a
# discrete scale does not match the data type of the x aesthetic.
ggplot(aes(x = Month, y = Temp), data = Mitchell) +
  geom_point() +
  scale_x_continuous(breaks = seq(0, 203, 12))


A New Perspective

What do you notice? Response:

Watch the solution video and check out the Instructor Notes! Notes:


Understanding Noise: Age to Age Months

Notes:

# Fractional age: whole-year age plus (1 - dob_month / 12), so a later
# birth month adds a smaller fraction of a year.
pf[["age_with_months"]] <- pf[["age"]] + (1 - pf[["dob_month"]] / 12)
# Confirm the new column has been added.
colnames(pf)
##  [1] "userid"                "age"                  
##  [3] "dob_day"               "dob_year"             
##  [5] "dob_month"             "gender"               
##  [7] "tenure"                "friend_count"         
##  [9] "friendships_initiated" "likes"                
## [11] "likes_received"        "mobile_likes"         
## [13] "mobile_likes_received" "www_likes"            
## [15] "www_likes_received"    "age_with_months"
# Show the first values of the new column.
head(pf[["age_with_months"]])
## [1] 14.08333 14.08333 14.08333 14.00000 14.00000 14.00000

Age with Months Means

# Group users by fractional age, then compute the mean and median friend
# count and the group size, ordered by age_with_months.
age_months_groups <- pf %>% group_by(age_with_months)
pf.fc_by_age_months <- age_months_groups %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age_with_months)
# Show the first rows of the per-month summary.
head(pf.fc_by_age_months)
## Source: local data frame [6 x 4]
## 
##   age_with_months friend_count_mean friend_count_median  n
## 1        13.16667          46.33333                30.5  6
## 2        13.25000         115.07143                23.5 14
## 3        13.33333         136.20000                44.0 25
## 4        13.41667         164.24242                72.0 33
## 5        13.50000         131.17778                66.0 45
## 6        13.58333         156.81481                64.0 54

Programming Assignment

# Mean friend count against fractional age, for ages up to 71.
ggplot(
  data = subset(pf.fc_by_age_months, age_with_months <= 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()


Noise in Conditional Means

# Mean friend count against fractional age (up to 71), with the rows
# selected by dplyr's filter().
ggplot(
  data = filter(pf.fc_by_age_months, age_with_months <= 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()


Smoothing Conditional Means

Notes:

# p1: mean friend count by whole-year age (up to 71) with a smoother.
p1 <- ggplot(
  data = subset(pf.fc_by_age, age <= 71),
  mapping = aes(x = age, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

# p2: mean friend count by fractional age (up to 71) with a smoother.
p2 <- ggplot(
  data = filter(pf.fc_by_age_months, age_with_months <= 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

# p3: mean friend count with ages rounded to the nearest multiple of 5.
p3 <- ggplot(
  data = subset(pf, age <= 71),
  mapping = aes(x = round(age / 5) * 5, y = friend_count)
) +
  geom_line(stat = "summary", fun.y = mean)


library(gridExtra)
## Loading required package: grid
# Stack the three plots in a single column for comparison.
grid.arrange(p1, p2, p3, ncol = 1)
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.


Which Plot to Choose?

Notes:


Analyzing Two Variables

Reflection: We learned how to compare two variables in our data, how to compute the correlation between two variables, and how to create visualizations for analyzing two variables. ***

Click KnitHTML to see all of your hard work and to have an html page of this lesson, your answers, and your notes!